{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6",
   "metadata": {},
   "source": [
    "# Acquiring Your Data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "83ad2114-5ed8-4a90-85fa-adea5eda4392",
   "metadata": {},
   "source": [
    "## Reading Data From CSV Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a360772e-7829-4c15-9af9-d4596efc7351",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m pip install pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e52f486-232e-440b-8585-90416e4300c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "797f69eb-3108-45d3-9a67-58c43593abf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "james_bond_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e47c1f9b-b390-4035-956b-622615b57f32",
   "metadata": {},
   "source": [
    "## Reading Data From Other Sources"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d85aee9-cfeb-460b-9fe8-f3c7e7dfb764",
   "metadata": {},
   "source": [
    "### Reading JSON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7465cd11-dad4-4741-9372-f825b28c33d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "james_bond_data = pd.read_json(\"james_bond_data.json\").convert_dtypes()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47a0e4a6-0ed9-4253-9833-0ad22c49b968",
   "metadata": {},
   "source": [
    "### Reading Excel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0364b81-64a0-4098-89fc-e58bd6d68257",
   "metadata": {},
   "outputs": [],
   "source": [
    "! python -m pip install openpyxl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8302139f-52dc-4f95-aa9a-96040ae5d82b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "james_bond_data = pd.read_excel(\"james_bond_data.xlsx\").convert_dtypes()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be4a1143-c966-4056-8a5e-3bdebe2a9b1f",
   "metadata": {},
   "source": [
    "### Reading Parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f36ef600-e6ba-4cc6-9ee3-0cbf369a4be2",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m pip install pyarrow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c86284a2-9073-4240-b4d5-5e8b0373fc27",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "james_bond_data = pd.read_parquet(\"james_bond_data.parquet\").convert_dtypes()\n",
    "\n",
    "james_bond_data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69f884c2-92e8-4db3-bd63-84007f654808",
   "metadata": {},
   "source": [
    "### Scraping HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b902722d-9648-4124-80b0-64004342170d",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m pip install lxml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fb2ff9c-3030-4f4a-be30-c2ab68452a21",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "james_bond_data_html = pd.read_html(\n",
    "    \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\"\n",
    ")\n",
    "james_bond_tables = james_bond_data_html[1].convert_dtypes()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31068de2-9864-434a-9652-b115d1131684",
   "metadata": {},
   "source": [
    "# Cleansing Your Data With Python"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0dcca3b-6e71-481d-a071-6218012db962",
   "metadata": {},
   "source": [
    "## Creating Meaningful Column Names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d70997b9-3c75-4165-b034-8544bd084c04",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_column_names = {\n",
    "    \"Release\": \"release_date\",\n",
    "    \"Movie\": \"movie_title\",\n",
    "    \"Bond\": \"bond_actor\",\n",
    "    \"Bond_Car_MFG\": \"car_manufacturer\",\n",
    "    \"US_Gross\": \"income_usa\",\n",
    "    \"World_Gross\": \"income_world\",\n",
    "    \"Budget ($ 000s)\": \"movie_budget\",\n",
    "    \"Film_Length\": \"film_length\",\n",
    "    \"Avg_User_IMDB\": \"imdb\",\n",
    "    \"Avg_User_Rtn_Tom\": \"rotten_tomatoes\",\n",
    "    \"Martinis\": \"martinis_consumed\",\n",
    "    \"Kills_Bond\": \"bond_kills\",\n",
    "}\n",
    "\n",
    "data = james_bond_data.rename(columns=new_column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "937b9121-b7ae-4f7e-800d-bfcc2689c98a",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e432b28e-257b-422b-b2f8-06f41608391b",
   "metadata": {},
   "source": [
    "## Dealing With Missing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d497e64c-aa7e-4d09-8de1-f529939d58f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b29d5a34-c930-4ce2-898c-b9e8aa7f771d",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[data.isna().any(axis=\"columns\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1db9201a-11c1-4cdd-9625-d70cee736191",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = james_bond_data.rename(columns=new_column_names).combine_first(\n",
    "    pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4af51fb-fd1f-4570-b16f-6f20e0b65473",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[data.isna().any(axis=\"columns\")]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6297c81-4c63-4eff-95e3-4a944bb5fe03",
   "metadata": {},
   "source": [
    "## Handling Financial Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "916c91b8-7888-40fc-bce7-247837508adf",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"income_usa\", \"income_world\", \"movie_budget\", \"film_length\"]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "880e4710-1c11-4de2-a2c3-97a9672ce6f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda data: (\n",
    "            data[\"income_usa\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae9c1d1b-a620-43c5-a199-eb6a7bff7ce2",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda data: (\n",
    "            data[\"income_usa\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        income_world=lambda data: (\n",
    "            data[\"income_world\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        movie_budget=lambda data: (\n",
    "            data[\"movie_budget\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38f36603-a797-4094-829f-fcfdbe2e80ed",
   "metadata": {},
   "source": [
    "## Correcting Invalid Data Types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8331b98e-169f-4d3b-9b88-0ece7ddc8dea",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda data: (\n",
    "            data[\"income_usa\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        income_world=lambda data: (\n",
    "            data[\"income_world\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        movie_budget=lambda data: (\n",
    "            data[\"movie_budget\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        film_length=lambda data: (\n",
    "            data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n",
    "        ),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6340b1f-3b1c-42e6-9b69-e981f645d77b",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"income_usa\", \"income_world\", \"movie_budget\", \"film_length\"]].info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7025fbd2-ce44-4efe-88c9-9f51830776c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"income_usa\", \"income_world\", \"movie_budget\", \"film_length\"]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d45b9b42-7c71-422f-9ddb-ea659e5385c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"release_date\"]].info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f5dacf7-2f6c-47f4-b875-7d36f2251627",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"release_date\"]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed0ead0e-7310-4c82-86d5-2480a95f1525",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda data: (\n",
    "            data[\"income_usa\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        income_world=lambda data: (\n",
    "            data[\"income_world\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        movie_budget=lambda data: (\n",
    "            data[\"movie_budget\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        film_length=lambda data: (\n",
    "            data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n",
    "        ),\n",
    "        release_date=lambda data: pd.to_datetime(\n",
    "            data[\"release_date\"], format=\"%B, %Y\"\n",
    "        ),\n",
    "        release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f147876-7348-43e9-ac6a-3f3df6ee2af9",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"release_date\", \"release_year\"]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47d4868a-94d8-4d36-85b9-b0c9a6203a8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"release_date\", \"release_year\"]].info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2c7922a-916e-4e01-829b-77cbb2205153",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89653d81-3bcd-4078-83cb-ad4b2fa560e6",
   "metadata": {},
   "source": [
    "## Fixing Inconsistencies in Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47a41ef3-751a-41ed-869d-9f2c45509196",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"income_usa\", \"income_world\", \"movie_budget\"]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc483320-7895-4368-a672-b98f8d0c9755",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda data: (\n",
    "            data[\"income_usa\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        income_world=lambda data: (\n",
    "            data[\"income_world\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        movie_budget=lambda data: (\n",
    "            data[\"movie_budget\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "            * 1000\n",
    "        ),\n",
    "        film_length=lambda data: (\n",
    "            data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n",
    "        ),\n",
    "        release_date=lambda data: pd.to_datetime(\n",
    "            data[\"release_date\"], format=\"%B, %Y\"\n",
    "        ),\n",
    "        release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6cbd7ea-e168-442e-8dd9-e2955288fa57",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"income_usa\", \"income_world\", \"movie_budget\"]].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bdaa8b1-9f2e-46a5-b53a-c1ae4c201c99",
   "metadata": {},
   "source": [
    "## Correcting Spelling Errors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e442e51a-28fd-42d7-94b0-aaf1abe5d9a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"bond_actor\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9863aa7-b5db-4ab1-be63-727ff437b63b",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda data: (\n",
    "            data[\"income_usa\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        income_world=lambda data: (\n",
    "            data[\"income_world\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        movie_budget=lambda data: (\n",
    "            data[\"movie_budget\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "            * 1000\n",
    "        ),\n",
    "        film_length=lambda data: (\n",
    "            data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n",
    "        ),\n",
    "        release_date=lambda data: pd.to_datetime(\n",
    "            data[\"release_date\"], format=\"%B, %Y\"\n",
    "        ),\n",
    "        release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n",
    "        bond_actor=lambda data: (\n",
    "            data[\"bond_actor\"]\n",
    "            .str.replace(\"Shawn\", \"Sean\")\n",
    "            .str.replace(\"MOORE\", \"Moore\")\n",
    "        ),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e313152b-92b4-43a8-8483-637281a1f04d",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"bond_actor\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a26b138d-72e5-4e15-a875-ee65023545d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"car_manufacturer\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd4ae142-e339-4601-b0a4-84375eb28c02",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda data: (\n",
    "            data[\"income_usa\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        income_world=lambda data: (\n",
    "            data[\"income_world\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        movie_budget=lambda data: (\n",
    "            data[\"movie_budget\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "            * 1000\n",
    "        ),\n",
    "        film_length=lambda data: (\n",
    "            data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n",
    "        ),\n",
    "        release_date=lambda data: pd.to_datetime(\n",
    "            data[\"release_date\"], format=\"%B, %Y\"\n",
    "        ),\n",
    "        release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n",
    "        bond_actor=lambda data: (\n",
    "            data[\"bond_actor\"]\n",
    "            .str.replace(\"Shawn\", \"Sean\")\n",
    "            .str.replace(\"MOORE\", \"Moore\")\n",
    "        ),\n",
    "        car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n",
    "            \"Astin\", \"Aston\"\n",
    "        ),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c596022b-02a4-40c0-ac5f-d0b0643a7a4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"car_manufacturer\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50c80bc8-fdb9-4c28-af5a-cd6b66c7a01d",
   "metadata": {},
   "source": [
    "## Checking For Invalid Outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8260f6b1-6d7f-4338-95b7-8946d69a92e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"film_length\", \"martinis_consumed\"]].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c73fe06b-5f42-4357-9b0f-2e460bf0dacf",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda data: (\n",
    "            data[\"income_usa\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        income_world=lambda data: (\n",
    "            data[\"income_world\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        movie_budget=lambda data: (\n",
    "            data[\"movie_budget\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "            * 1000\n",
    "        ),\n",
    "        film_length=lambda data: (\n",
    "            data[\"film_length\"]\n",
    "            .str.removesuffix(\"mins\")\n",
    "            .astype(\"Int64\")\n",
    "            .replace(1200, 120)\n",
    "        ),\n",
    "        release_date=lambda data: pd.to_datetime(\n",
    "            data[\"release_date\"], format=\"%B, %Y\"\n",
    "        ),\n",
    "        release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n",
    "        bond_actor=lambda data: (\n",
    "            data[\"bond_actor\"]\n",
    "            .str.replace(\"Shawn\", \"Sean\")\n",
    "            .str.replace(\"MOORE\", \"Moore\")\n",
    "        ),\n",
    "        car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n",
    "            \"Astin\", \"Aston\"\n",
    "        ),\n",
    "        martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n",
    "            -6, 6\n",
    "        ),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2abb5b80-48be-4a00-9483-4732b9a5d802",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"film_length\", \"martinis_consumed\"]].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e129b32-5e66-41cb-b938-8fd58bb94116",
   "metadata": {},
   "source": [
    "## Removing Duplicate Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be7aad8b-ef3f-48a6-a9a0-de909133921f",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[data.duplicated(keep=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c98c7640-1472-4869-9fdd-f070d665ae1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda data: (\n",
    "            data[\"income_usa\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        income_world=lambda data: (\n",
    "            data[\"income_world\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "        ),\n",
    "        movie_budget=lambda data: (\n",
    "            data[\"movie_budget\"]\n",
    "            .replace(\"[$,]\", \"\", regex=True)\n",
    "            .astype(\"Float64\")\n",
    "            * 1000\n",
    "        ),\n",
    "        film_length=lambda data: (\n",
    "            data[\"film_length\"]\n",
    "            .str.removesuffix(\"mins\")\n",
    "            .astype(\"Int64\")\n",
    "            .replace(1200, 120)\n",
    "        ),\n",
    "        release_date=lambda data: pd.to_datetime(\n",
    "            data[\"release_date\"], format=\"%B, %Y\"\n",
    "        ),\n",
    "        release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n",
    "        bond_actor=lambda data: (\n",
    "            data[\"bond_actor\"]\n",
    "            .str.replace(\"Shawn\", \"Sean\")\n",
    "            .str.replace(\"MOORE\", \"Moore\")\n",
    "        ),\n",
    "        car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n",
    "            \"Astin\", \"Aston\"\n",
    "        ),\n",
    "        martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n",
    "            -6, 6\n",
    "        ),\n",
    "    )\n",
    "    .drop_duplicates(ignore_index=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff05e0ae-4f9b-47a7-87f1-fb7630fabddc",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[data.duplicated(keep=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1216a25-4791-4601-83ba-62513e4cc880",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"movie_title\"].value_counts().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ba17e3f-3ce1-4885-a104-f60d254d9feb",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"bond_actor\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52db1351-36ed-4104-a999-345ebbc62214",
   "metadata": {},
   "source": [
    "## Storing Your Cleansed Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "575a774e-6913-41fb-8ff9-4d786f478007",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0",
   "metadata": {},
   "source": [
    "# Performing Data Analysis Using Python"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e",
   "metadata": {},
   "source": [
    "## Performing a Regression Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m pip install matplotlib scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27d0a3dd-e71a-4b8a-883c-40cb5c001f7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.scatter(data[\"imdb\"], data[\"rotten_tomatoes\"])\n",
    "ax.set_title(\"Scatter Plot of Ratings\")\n",
    "ax.set_xlabel(\"Average IMDB Rating\")\n",
    "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n",
    "# fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "x = data.loc[:, [\"imdb\"]]\n",
    "y = data.loc[:, \"rotten_tomatoes\"]\n",
    "\n",
    "model = LinearRegression()\n",
    "model.fit(x, y)\n",
    "\n",
    "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n",
    "best_fit = f\"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}\"\n",
    "y_pred = model.predict(x)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.scatter(x, y)\n",
    "ax.plot(x, y_pred, color=\"red\")\n",
    "ax.text(7.25, 5.5, r_squared, fontsize=10)\n",
    "ax.text(7.25, 7, best_fit, fontsize=10)\n",
    "ax.set_title(\"Scatter Plot of Ratings\")\n",
    "ax.set_xlabel(\"Average IMDb Rating\")\n",
    "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n",
    "# fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b38df412-c320-49fb-93ae-e253405537a8",
   "metadata": {},
   "source": [
    "## Investigating a Statistical Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "938e5942-e57f-4e41-99f1-215cfb37d0df",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "length = data[\"film_length\"].value_counts(bins=7).sort_index()\n",
    "length.plot.bar(\n",
    "    ax=ax,\n",
    "    title=\"Film Length Distribution\",\n",
    "    xlabel=\"Time Range (mins)\",\n",
    "    ylabel=\"Count\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff4e9955-baf4-48eb-b032-fbf55f439194",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"film_length\"].agg([\"min\", \"max\", \"mean\", \"std\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870",
   "metadata": {},
   "source": [
    "## Finding No Relationship"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bb83374-347f-4cf6-bc21-8180a003371d",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "ax.scatter(data[\"imdb\"], data[\"bond_kills\"])\n",
    "ax.set_title(\"Scatter Plot of Kills vs Ratings\")\n",
    "ax.set_xlabel(\"Average IMDb Rating\")\n",
    "ax.set_ylabel(\"Kills by Bond\")\n",
    "# fig.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
